# Ryan Copus, Ryan Hübert and Paige Pellaton
# "Trading Diversity? Judicial Diversity and Case Outcomes in Federal Courts"
# American Political Science Review

# File name: chp_apsr_03_analysis.R
# Last revision date: March 24, 2024
# Questions or comments? Contact Ryan Hübert: https://ryanhubert.com/

# What does this script do?
# This script performs the main statistical analyses in the paper. 

# Last pre-print execution of this code: 
# > Date: March 24, 2024 
# > Machine: MacBook Pro 14" (2021 model) with Apple M1 Max chip and 64 GB RAM
# > OS: macOS Sonoma 14.4
# > R: version 4.3.2

################################################################################
# Before you run this script...
################################################################################

# Clear the workspace
rm(list = ls())

# Set this to TRUE if you wish to finish running a set of analyses that you 
# already began. Leave as FALSE if you are planning to re-run all analyses from 
# scratch (e.g., for replication).
to.finish <- FALSE

# This toggle allows you to not save the various outputs of this function
# so that you can do some debugging. Simply switch it to FALSE to do so.
to.save <- TRUE

# Do you want to run this using all your cores? If so, leave this as is. 
# If not, set `ncores` to 1 and it will run the code in sequence. You may wish 
# to do this if you do not have a lot of memory. The fewer cores you use, the 
# slower this will run. 
require(parallel)
ncores <- max(c(detectCores(),1))

################################################################################
# Load packages and set options
################################################################################

# Load packages required for analyses
require(tidyverse)
require(estimatr)

# Turn off messages from summarize function
options(dplyr.summarise.inform = FALSE)

################################################################################
# Directory management
################################################################################

# Define and set the working directory.
wdir <- gsub("/[Cc]ode/?","",getwd())
setwd(wdir)

# Define the directory that will store the results of the analyses.
odir <- paste0(wdir, "/outs")

if(to.finish){
  # If you have decided to finish an analysis you previously started, this will
  # find the latest set of analyses saved in the analysis folder
  cdt <- list.files(odir)
  cdt <- max(cdt[grepl("^20[0-9]+$", cdt)])
  
  cdir <- paste0(odir,"/",cdt)
  
  completed <- intersect(gsub(".csv", "", list.files(paste0(cdir,"/rows"))), 
                         gsub(".csv", "", list.files(paste0(cdir,"/regs"))))
  completed <- c(completed, list.files(paste0(cdir, "/_errs")))
  completed <- sort(unique(completed))
} else {
  # Create a subfolder in `outs` that has the date/time that these analyses
  # are run. This allows us to keep multiple versions and timestamp them.
  
  # What is the current date and time? This will be the name of the subfolder.
  cdt <- as.character(Sys.time())
  cdt <- gsub("[^0-9]+","",cdt)
  cdt <- gsub("^([0-9]{14}).*$","\\1", cdt)
  
  # Create several additional subfolders in cdir to store 
  cdir <- paste0(odir,"/",cdt)
  if(!dir.exists(cdir)){
    dir.create(cdir)
    dir.create(paste0(cdir, "/masks"))
    dir.create(paste0(cdir, "/regs"))
    dir.create(paste0(cdir, "/rows"))
    dir.create(paste0(cdir, "/_errs")) # to store model name when error returned
  }
  
  completed <- c()
}

################################################################################
# Load, merge and perform minor cleaning of the datasets used in the analyses
################################################################################

# Load the Copus, Hübert and Pellaton dataset of civil rights cases.
df <- read_csv(paste0(wdir,"/data/chp_apsr_case_data.csv"), show_col_types = FALSE)

# Drop all cases that were not in the IDB dataset or in our database of docket
# sheets.
df <- df[which(!is.na(df$file) & !is.na(df$CASE_ID)),]

# Drop all cases we suspect may not be randomly assigned (e.g. MDL cases, 
# related cases, visiting judges, etc.). See the paper for more details.
df <- df[is.na(df$to_drop), ]

# Convert outcome and president to dummy variables.
df$settlement <- ifelse(df$OUTCOME=="settlement", 1, 0)
df$dism_invol_jud_def <- ifelse(df$OUTCOME=="dism_invol_jud_def", 1, 0)

# Convert block to factor variables.
df$block <- as.factor(df$block)
df$president <- as.factor(df$president)
df$jid <- as.factor(df$jid)

# What are the variables we will use in the analysis?
ovars <- c("settlement", "dism_invol_jud_def")
jvars <- c("jid", "democrat", "republican", "traditional" ,"president")
tvars <- list("democrat" = c("nontraditional", "nonwhite", "black", "latino", "woman", "white_woman", "nonwhite_woman", "nonwhite_man"),
              "republican" = c("nontraditional", "nonwhite", "black", "latino", "woman", "white_woman", "nonwhite_man"))
bvars <- c("DISTRICT", "block")

# Only keep columns we need.
df <- df[, c("to_drop", "OPEN_ID", "df_main", "df_scales", ovars, jvars, 
             unname(unique(unlist(tvars))), bvars)] 

# Only keep rows in the main dataset or the SCALES dataset.
df <- df[df$df_main==1 | df$df_scales==1,]

# Merge in the litigant race and gender information.
lvars <- c("nonwhite", "female", "white_woman", "nonwhite_man")
lf <- distinct(df[(df$df_main==1),c("OPEN_ID")])
for(z in c("w","p")){
  lf1 <- read_csv(paste0(wdir,"/outs/chp_apsr_plaintiff_identities.csv"), show_col_types = FALSE)
  names(lf1) <- gsub("pla_", "", names(lf1))
  lvars1 <- colnames(lf1)[colnames(lf1) %in% paste0(lvars,"_",z) | colnames(lf1)=="female"]
  lf1 <- lf1[,c("OPEN_ID", paste0("white_",z), lvars1)]
  lf1[(lf1 > 0) & (lf1 < 1)] <- NA
  lf1 <- rename(lf1, woman = female)
  lf1[[paste0("white_woman_",z)]] <- lf1$woman * lf1[[paste0("white_",z)]]
  lf1[[paste0("nonwhite_woman_",z)]] <- lf1$woman * lf1[[paste0("nonwhite_",z)]]
  lf1[[paste0("nonwhite_man_",z)]] <- (1-lf1$woman) * lf1[[paste0("nonwhite_",z)]]
  names(lf1)[2:ncol(lf1)] <- paste0("pla_", names(lf1)[2:ncol(lf1)])
  lf <- left_join(lf, lf1[,c("OPEN_ID", colnames(lf1)[!colnames(lf1) %in% colnames(lf)])], by = join_by(OPEN_ID))
  rm(lf1)
}
df <- left_join(df, lf, by = join_by(OPEN_ID))
rm(lf)

lvars <- gsub("female", "woman", lvars)

################################################################################
# Define the analyses to be completed
################################################################################

# Create a dataframe that serves as the index for all the different analyses
# we run for the article, including robustness checks. 

af <- NULL

# Partisan effects
af <- bind_rows(af, expand.grid("republican", 
                                ovars, 
                                0,
                                NA,
                                NA,
                                NA, 
                                0, 
                                stringsAsFactors = FALSE))

# Main analyses
af <- bind_rows(af, expand.grid("nontraditional",
                                ovars, 
                                0,
                                c("republican","democrat"),
                                NA, 
                                NA, 
                                0, 
                                stringsAsFactors = FALSE))

# SCALES dataset
af <- bind_rows(af, expand.grid("nontraditional",
                                ovars, 
                                1,
                                c("republican","democrat"),
                                NA, 
                                NA, 
                                0, 
                                stringsAsFactors = FALSE)) # pres controls

# Subgroup analyses
for(p in c("democrat", "republican")){
  af <- bind_rows(af, expand.grid(tvars[[p]][2:length(tvars[[p]])],
                                  c("settlement"), 
                                  0,
                                  p,
                                  NA, 
                                  NA, 
                                  0, 
                                  stringsAsFactors = FALSE))
  af <- bind_rows(af, expand.grid(tvars[[p]][2:length(tvars[[p]])],
                                  c("dism_invol_jud_def"), 
                                  0,
                                  p,
                                  NA, 
                                  NA, 
                                  0, 
                                  stringsAsFactors = FALSE))
}

# Heterogeneous effects
af <- bind_rows(af, expand.grid(c("nonwhite","woman"), 
                                c("settlement"), 
                                0,
                                c("republican","democrat"),
                                c(0,1), 
                                "w", 
                                0, 
                                stringsAsFactors = FALSE))

af <- bind_rows(af, expand.grid(lvars, 
                                c("settlement"), 
                                0,
                                c("republican","democrat"),
                                c(0,1), 
                                c("w","p"), 
                                0, 
                                stringsAsFactors = FALSE))


# Robustness: president controls
af <- bind_rows(af, expand.grid("nontraditional",  
                                ovars, 
                                0,
                                c("republican","democrat"),
                                NA,
                                NA, 
                                1,
                                stringsAsFactors = FALSE))


colnames(af) <- c("Treatment", "Outcome", "SCALES", "Party", "Plaintiff_Shares", "Race_Coding", "President_Controls")

# Fix the gender coding
af <- af[!(!is.na(af$Race_Coding) & af$Race_Coding=="p" & af$Treatment == "woman"),]
af$Race_Coding[!is.na(af$Race_Coding) & af$Race_Coding=="w" & af$Treatment == "woman"] <- ""

# Give each analysis a unique ID number
af <- distinct(af)
af$Model_ID <- paste0("", str_pad(seq(1,nrow(af)), 3, "left", "0"))

################################################################################
# Define the steps of the analysis
################################################################################

# Define a function that completes all the steps of the analysis. It will take 
# as its argument a number corresponding to a specific analysis defined in the 
# analysis index dataframe `af` defined above.

# This function will be used to parallelize the execution of all the analyses.
# However it can be run individually on each row of `af` by simply running 
# `rfun` with x equaling the relevant row number.
rfun <- function(x){
  
  tryCatch({
    
    # Extract key variables from the relevant row of `af`
    t <- as.character(af$Treatment[x]) # treatment variable
    c <- ifelse(t == "republican", "democrat", "traditional") # control variable
    y <- as.character(af$Outcome[x]) # outcome variable
    s <- af$SCALES[x] # analyze the SCALES dataset?
    p <- ifelse(is.na(as.character(af$Party[x])), "partisan", as.character(af$Party[x])) # what subset?
    hte1 <- as.character(af$Plaintiff_Shares[x]) # heterogeneous treatment analysis
    hte2 <- as.character(af$Race_Coding[x]) # which race coding? wru or predictrace?
    pc <- as.character(af$President_Controls[x]) # use president controls?
    m <- as.character(af$Model_ID[x]) # what is the ID number for this analysis?
    
    # Create a name for the subset we are analyzing
    mname <- p 
    
    ##############################################################################
    # Identify the subset of cases for the analysis
    ##############################################################################
    
    # Are we analyzing the main dataset or the SCALES dataset?
    if(s == 1){
      mask <- (df[["df_scales"]] == s)  
    } else if(s == 0){
      mask <- (df[["df_main"]] == 1-s)  
    }
    
    # Are we running the partisan effects or the identity effects?
    if(p != "partisan"){
      mask <- mask & (df[[p]] == 1)
    }
    
    # Are we running the effects on subsets of plaintiffs? 
    if(grepl("[01]", hte1)){
      subset.var.name <- gsub("[_]$","",paste0("pla_", t, "_", hte2))
      mask <- mask & (!is.na(df[[subset.var.name]])) & (df[[subset.var.name]] == hte1)
      mname <- paste0(mname,"/", subset.var.name, "=", hte1)
    }
    
    # Clean 1: each observation should be in treatment or control.
    mask <- mask & (rowSums(df[, c(t,c)]) == 1)
    
    # Clean 2: ensure minimum observations in treatment * block (at least 5 per block).
    tmp <- summarise_at(group_by(df[mask,], block), vars(!!sym(t), !!sym(c)), sum)
    tmp <- tmp[(tmp[[t]] >= 5) & (tmp[[c]] >= 5),]
    
    # Create the final mask of the dataset that we will use for the regression.
    mask <- mask & (df[["block"]] %in% tmp[["block"]])
    
    # If there are fewer than 20 judges in the treatment group or control group
    # then we do not present estimates. See paper for more discussion of the 
    # problem of drawing inferences from small numbers of judges.
    tmp <- summarize(group_by(df[mask,], !!sym(t)), n = n_distinct(jid))
    if(nrow(tmp[tmp$n<20,])>0){
      stop("Too few judges!")
    }
    
    # Keep a record of the subset of cases used for this analysis
    if(to.save){
      mf1 <- bind_rows(mutate(df[mask,c("OPEN_ID")], t = eval(t), c = eval(c), mask = eval(mname), model_id = m))
      write_csv(mf1, file = paste0(cdir, "/masks/", m,".csv"))
      rm(mf1)
    }
  
    ##############################################################################
    # Estimate the effects
    ##############################################################################
    
    # Define two formulas: one with the outcome and treatment variable, and the 
    # other with the controls (randomization blocks and president controls).
    my.ate <- formula(paste0(y, ' ~ ', t))
    my.cov <- formula(ifelse(pc=="0", "~ block", "~ block + president"))
    
    # Estimate the model with judge clustered standard errors.
    dim_out <- lm_lin(formula = my.ate, covariates = my.cov, data = df[mask,], clusters = jid)
    
    # While all models produce clustered standard error estimates for the main
    # treatment variable, the models run on the cases heard by Democratic 
    # appointees in the SCALES dataset do not produce standard error estimates 
    # for some of the randomization blocks. This is likely due to the unusual 
    # structure of our data: the judge clusters we use to estimate standard 
    # errors are spread across the randomization blocks in ways that sometimes
    # create estimation problems. When we estimate these models, we will check 
    # whether this problem arises and implement a fix, dropping three small 
    # districts court with the smallest caseload, after which we can reliably 
    # estimate standard errors.
    
    # First, check if there is an estimation problem. This only returns
    # TRUE for cases heard by Democratic appointees in the SCALES dataset.
    se.est.problem <- ifelse(any(is.na(dim_out$std.error)), TRUE, FALSE)
    if(se.est.problem){
      # Drop cases heard by Democratic appointees in the Eastern and Western 
      # Districts of Arkansas, as well as the Northern District of Alabama.
      # This drops 331 cases, or 1.3% of the cases.
      mask <- mask & !(df$DISTRICT %in% c("ared", "arwd", "alnd"))
      dim_out <- lm_lin(formula = my.ate, covariates = my.cov, data = df[mask,], clusters = jid)
      # Check again if there is any remaining estimation problem. If so, return
      # an error. This should not return an error!
      se.est.problem <- ifelse(any(is.na(dim_out$std.error)), TRUE, FALSE)
      if(se.est.problem){
        stop("Standard error estimation problem!")
      }
    }
    
    ##############################################################################
    # Save and return a row
    ##############################################################################
    
    # What is the treatment variable called?
    treat.control <- gsub('^([^ ]+) .+','\\1', dim_out$term[2])
    
    # Create a row that contains all the relevant information we wish to report
    # in the paper and in the appendix.
    row <- c(dim_out$coefficients[[treat.control]],
             dim_out$std.error[[treat.control]],
             dim_out$statistic[[treat.control]], 
             dim_out$p.value[[treat.control]],
             dim_out$conf.low[[treat.control]], 
             dim_out$conf.high[[treat.control]],
             dim_out$df[[treat.control]], 
             dim_out$nobs, 
             sum(df[mask,][[t]]),
             sum(1-df[mask,][[t]]),
             length(unique(df$jid[mask & df[[t]]==1])),
             length(unique(df$jid[mask & df[[t]]==0])),
             length(unique(df$block[mask])), 
             y, 
             mname, 
             ifelse(s==1,"scales","main"), 
             t,
             paste0('judge_clustered'))
    row <- as.data.frame(rbind(row))
    colnames(row) <- c('effect', 'se', 'tval', 'pval', 'ciL', 'ciH', 'df',
                       'obs', 'obs_treat', 'obs_control', 'judges_treat',  'judges_control', 'fe_count', 
                       'outcome', 'mask', 'dataset', 'treatment', 'se_type')
    rownames(row) <- NULL
    row[1:7] <- round(as.numeric(row[1:7]),5)
    row[8:13] <- as.numeric(row[8:13])
    
    row$pres_controls <- pc
    row$model_id <- m
    
    # Save the row with the information to report in the paper/appendix.
    if(to.save){
      # Save the full regression output for this analysis
      rfile <- paste0(cdir, "/regs/", m,".csv")
      write_csv(as_tibble(tidy(dim_out)), file = rfile)
      # Save the row for the treatment variable
      rfile <- paste0(cdir, "/rows/", m,".csv")
      write_csv(row, file = rfile)
    }
    return(row)
  
  }, error = function(e) {
    write(as.character(e), file=paste0(cdir, "/_errs/", af$Model_ID[x]))
  })
}

################################################################################
# Run all of the models included in the paper and appendix
################################################################################

# Conduct the main analyses
torun <- which(!af$Treatment == "republican" & !af$Model_ID %in% completed)
print(paste0("Using ", ncores, " cores to run ", length(torun), 
             " regressions. This may take a while..."))
q <- Sys.time()
ef <- mclapply(torun, rfun, mc.cores = ncores)
print(Sys.time() - q)

# Conduct the partisan analyses
# (Note: have to do these in sequence because each regression takes too much memory for parallel processing.)
torun <- which(af$Treatment == "republican" & !af$Model_ID %in% completed)
for(i in torun){
  print(i)
  q <- Sys.time()
  ef <- rfun(i)
  print(Sys.time() - q)
}

# Save a copy of the index of the analyses completed
af$ExecutionReport <- "Normal"
af$ExecutionReport[af$Model_ID %in% list.files(paste0(cdir, "_errs/"))] <- "Execution Error"
write_csv(af, file = paste0(cdir, "/model_index.csv"))